1 Commented Header

# Course: BUAN 5210
# Title: Technical Appendix
# Purpose: Suggetion to motivae people to buy the assets managment or retirement product based on Basic EDA and detailed EDA
# Date: Mar 14th, 2019
# Author: Ying Xue

2 Clear Working Environment

# Clear environment of variables and functions
rm(list = ls(all = TRUE)) 

# library package
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.0     ✔ readr   1.3.1
## ✔ tibble  2.1.3     ✔ purrr   0.3.2
## ✔ tidyr   0.8.3     ✔ stringr 1.4.0
## ✔ ggplot2 3.2.0     ✔ forcats 0.4.0
## ── Conflicts ────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(stringr)
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
## 
## Attaching package: 'GGally'
## The following object is masked from 'package:dplyr':
## 
##     nasa
library(htmlTable)
library(kableExtra)
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
library(janitor)
## 
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(shiny)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
## 
##     discard
## The following object is masked from 'package:readr':
## 
##     col_factor
library(mfx)
## Loading required package: sandwich
## Loading required package: lmtest
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:plotly':
## 
##     select
## The following object is masked from 'package:dplyr':
## 
##     select
## Loading required package: betareg
library(MASS)

3 Load data and prepare for EDA

# load data
raw_data <- read_csv("Personal_needs.csv")

# Data Manipulation

# select the columns of interest
research_data <-dplyr::select(raw_data,FinancialSecurity,tv,intrnet,socmed,charit,excer,garden,impulse,eatout,pubtran,sportfan,commun,envichoi,brandloyal,gender,age,jobstatus,Hhincome,retirement,educ,marstatus)

# remove the rows of na 
tidy_table <- research_data %>% na.omit() 

# convert character into factor
tidy_table <- tidy_table %>% mutate(
  gender = as.factor(gender),
  jobstatus= as.factor(jobstatus),
  educ = as.factor(educ),
  marstatus = as.factor(marstatus),
  Hhincome = as.factor(Hhincome),
    
# rename the category within the Hhincome according to the Census Bureau 2017
 Cat_Hhincome = case_when(
 Hhincome == "$20,000-$39,999"| Hhincome == "Below $20,000"~"Pov/Low",     
 Hhincome == "$40,000-$59,999"|Hhincome == "$40,000-$59,999"|Hhincome == "$60,000-$79,999"|Hhincome == "$80,000-$99,999"|Hhincome == "$100,000-$149,999" ~ "Middle",         
 Hhincome == "$150,000-$199,999"|Hhincome == "$200,000 & above"~"High"),
 Cat_Hhincome = as.factor(Cat_Hhincome),
 
# rename the category within the retirement 
 retirement = case_when(
 retirement == "$100,000-$149,999" ~ "100k-150k",         
 retirement == "$20,000-$59,999"~"20k-60k",             
 retirement == "$200,000-$299,999" ~ "200k-300k",               
 retirement == "$300,000-$399,999" ~"300k-400k",       
 retirement == "$400,000-$499,999" ~ "400k-500k",         
 retirement == "$500,000 and above" ~ ">=500k",          
 retirement == "$60,000-$99,999" ~ "60k-100k",
 retirement == "below $20,000"  ~ "<20k",TRUE ~ "NO"),
 Cat_retirement= case_when(retirement == "NO" ~ "NO", TRUE~"YES"),
 retirement = as.factor(retirement), Cat_retirement = as.factor( Cat_retirement))

# convert the age into category
  tidy_table$Cat_age<- cut(tidy_table$age, breaks=c(0,20, 30, 40, 50, 60 ,70,100), right = FALSE,labels=c("<20","20s","30s","40s","50s","60s",">=70"))

# convert FinancialSecurity Motivation into category 
 tidy_table$Cat_Fin<- cut(tidy_table$FinancialSecurity, breaks=c(2,4,5,8,9,10), right = FALSE,labels=c("not important","slightly important","moderate important","very important","extrem important"))

str(tidy_table) 
## Classes 'tbl_df', 'tbl' and 'data.frame':    366 obs. of  25 variables:
##  $ FinancialSecurity: num  8 7.67 6.33 8.33 8.33 ...
##  $ tv               : num  0 4 10 6 5 20 2 2 14 0 ...
##  $ intrnet          : num  2 5 7 3 2 10 1 2 5 25 ...
##  $ socmed           : num  0 1 0 1 0 2 0.5 0.5 2 5 ...
##  $ charit           : num  250 0 2000 2500 500 500 12000 0 200 400 ...
##  $ excer            : num  2 3 6 9 18 1 5 2 4 7 ...
##  $ garden           : num  0 0 6 4 2 0 0 0 0 2 ...
##  $ impulse          : num  300 500 2000 200 2000 2500 100 200 300 2500 ...
##  $ eatout           : num  10 5 4 1 15 30 12 6 2 1 ...
##  $ pubtran          : num  80 0 40 40 0 0 0 30 6 10 ...
##  $ sportfan         : num  1 7 5 6 4 5 1 5 5 3 ...
##  $ commun           : num  2 4 5 6 4 5 4 3 1 7 ...
##  $ envichoi         : num  3 5 6 6 7 6 4 5 5 7 ...
##  $ brandloyal       : num  2 5 2 1 1 7 5 3 5 1 ...
##  $ gender           : Factor w/ 2 levels "Female","Male": 2 2 1 1 1 1 1 1 1 1 ...
##  $ age              : num  55 39 61 63 40 34 30 23 28 36 ...
##  $ jobstatus        : Factor w/ 5 levels "Administration",..: 5 2 2 1 2 2 4 4 4 4 ...
##  $ Hhincome         : Factor w/ 8 levels "$100,000-$149,999",..: 3 7 2 5 5 1 1 8 6 8 ...
##  $ retirement       : Factor w/ 9 levels "<20k",">=500k",..: 1 5 9 2 8 5 6 9 1 5 ...
##  $ educ             : Factor w/ 6 levels "college graduate",..: 2 2 2 2 2 2 2 1 1 1 ...
##  $ marstatus        : Factor w/ 6 levels "Divorced","Married",..: 5 2 2 2 5 2 2 5 2 1 ...
##  $ Cat_Hhincome     : Factor w/ 3 levels "High","Middle",..: 3 2 1 2 2 2 2 3 2 3 ...
##  $ Cat_retirement   : Factor w/ 2 levels "NO","YES": 2 2 1 2 2 2 2 1 2 2 ...
##  $ Cat_age          : Factor w/ 7 levels "<20","20s","30s",..: 5 3 6 6 4 3 3 2 2 3 ...
##  $ Cat_Fin          : Factor w/ 5 levels "not important",..: 4 3 3 4 4 3 5 4 3 1 ...
summary(tidy_table) 
##  FinancialSecurity       tv           intrnet          socmed      
##  Min.   :2.000     Min.   : 0.00   Min.   : 1.00   Min.   : 0.000  
##  1st Qu.:6.333     1st Qu.: 5.00   1st Qu.: 5.00   1st Qu.: 1.000  
##  Median :7.333     Median :10.00   Median :12.00   Median : 2.000  
##  Mean   :7.023     Mean   :15.51   Mean   :15.82   Mean   : 4.198  
##  3rd Qu.:8.000     3rd Qu.:21.00   3rd Qu.:20.00   3rd Qu.: 5.000  
##  Max.   :9.000     Max.   :80.00   Max.   :77.00   Max.   :40.000  
##                                                                    
##      charit            excer            garden          impulse       
##  Min.   :    0.0   Min.   : 0.000   Min.   : 0.000   Min.   :   0.00  
##  1st Qu.:   10.0   1st Qu.: 2.000   1st Qu.: 0.000   1st Qu.:  61.25  
##  Median :  100.0   Median : 4.000   Median : 0.000   Median : 200.00  
##  Mean   :  791.1   Mean   : 4.773   Mean   : 1.777   Mean   : 443.37  
##  3rd Qu.:  500.0   3rd Qu.: 7.000   3rd Qu.: 2.000   3rd Qu.: 500.00  
##  Max.   :15000.0   Max.   :30.000   Max.   :21.000   Max.   :5000.00  
##                                                                       
##      eatout         pubtran          sportfan         commun     
##  Min.   : 0.00   Min.   : 0.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.: 2.00   1st Qu.: 0.000   1st Qu.:2.000   1st Qu.:4.000  
##  Median : 3.00   Median : 0.000   Median :5.000   Median :4.500  
##  Mean   : 4.53   Mean   : 4.612   Mean   :4.216   Mean   :4.415  
##  3rd Qu.: 5.00   3rd Qu.: 3.000   3rd Qu.:6.000   3rd Qu.:6.000  
##  Max.   :30.00   Max.   :90.000   Max.   :7.000   Max.   :7.000  
##                                                                  
##     envichoi       brandloyal       gender         age       
##  Min.   :1.000   Min.   :1.000   Female:251   Min.   :18.00  
##  1st Qu.:4.000   1st Qu.:3.000   Male  :115   1st Qu.:28.00  
##  Median :5.000   Median :4.000                Median :43.00  
##  Mean   :4.989   Mean   :3.995                Mean   :43.22  
##  3rd Qu.:6.000   3rd Qu.:5.000                3rd Qu.:57.00  
##  Max.   :7.000   Max.   :7.000                Max.   :88.00  
##                                                              
##           jobstatus                Hhincome      retirement 
##  Administration:  6   $20,000-$39,999  :81   NO       :158  
##  Faculty       : 38   Below $20,000    :79   <20k     : 78  
##  Non-university:211   $40,000-$59,999  :65   20k-60k  : 36  
##  Staff         : 80   $80,000-$99,999  :44   100k-150k: 23  
##  Student       : 31   $100,000-$149,999:42   >=500k   : 21  
##                       $60,000-$79,999  :40   60k-100k : 20  
##                       (Other)          :15   (Other)  : 30  
##                    educ         marstatus    Cat_Hhincome Cat_retirement
##  college graduate    :121   Divorced : 31   High   : 15   NO :158       
##  graduate degree     : 84   Married  :165   Middle :191   YES:208       
##  high school graduate: 43   Other    :  4   Pov/Low:160                 
##  other               :  3   Partnered: 49                               
##  some college        :111   single   :108                               
##  some high school    :  4   Widowed  :  9                               
##                                                                         
##  Cat_age                 Cat_Fin   
##  <20 : 9   not important     : 13  
##  20s :92   slightly important: 18  
##  30s :63   moderate important:210  
##  40s :66   very important    : 86  
##  50s :61   extrem important  : 39  
##  60s :56                           
##  >=70:19
  • Observations:
  • 4 sections of data : value, behavior, attitude and demographic information
  • 366 observation after ommiting the rows containing na
  • nobody holds negative value towards Financial Security
  • 5 jobstatus
  • 8 level of household incom
  • 10 levels of retirment plan
  • 6 level of education
  • More than half of the respondents already got a retirement plan,that’s the potential customer to sell the product

4 Basic EDA

4.1 Univariate Data Exploration

4.1.1 Univariate Non-Graphical

#Create function for frequency tables 
count_table <- function(x,colname){
   x = enquo(x)
   kable(
    tidy_table %>%
      tabyl(!!x) %>%
      adorn_totals()%>%
      adorn_pct_formatting(digits = 0 ),
      digits = 2,
      format = "html",
      align = c("l","c","c"),
      col.names = c(colname,"Count","Total")
    )%>%
  kable_styling(full_width = F)}

#Make count tables for univariate variables for segmentation
count_table(Cat_Fin,"Value of Financial Security")
Value of Financial Security Count Total
not important 13 4%
slightly important 18 5%
moderate important 210 57%
very important 86 23%
extrem important 39 11%
Total 366 100%
count_table(Cat_age,"Age")
Age Count Total
<20 9 2%
20s 92 25%
30s 63 17%
40s 66 18%
50s 61 17%
60s 56 15%
>=70 19 5%
Total 366 100%
count_table(Cat_Hhincome,"Household Income") 
Household Income Count Total
High 15 4%
Middle 191 52%
Pov/Low 160 44%
Total 366 100%
count_table(retirement,"Retirment Plan")
Retirment Plan Count Total
<20k 78 21%
>=500k 21 6%
100k-150k 23 6%
200k-300k 14 4%
20k-60k 36 10%
300k-400k 13 4%
400k-500k 3 1%
60k-100k 20 5%
NO 158 43%
Total 366 100%
count_table(marstatus,"Marital Status")
Marital Status Count Total
Divorced 31 8%
Married 165 45%
Other 4 1%
Partnered 49 13%
single 108 30%
Widowed 9 2%
Total 366 100%
count_table(gender,"Gender")
Gender Count Total
Female 251 69%
Male 115 31%
Total 366 100%
count_table(educ,"Education Attainment")
Education Attainment Count Total
college graduate 121 33%
graduate degree 84 23%
high school graduate 43 12%
other 3 1%
some college 111 30%
some high school 4 1%
Total 366 100%
  • Finding of the data: ++ more than 50% of the people think that Financial Security is moderate impotant and have joined a retirement plan and 33% of people think Financial secrutiy is very or extreme important, these maybe potential customers

4.1.2 Univariate Graphical

We begin with exploration of the categorical variables.

# Code histograms using grid.arrange so can see all variables together
grid.arrange(
  # distribution by retirement
  tidy_table %>% 
      ggplot(aes(retirement))+
      geom_bar() + 
      theme(axis.text.x = element_text(angle=60, hjust=1)), 
  tidy_table %>% 
      ggplot(aes(Cat_retirement))+
      geom_bar() + 
      theme(axis.text.x = element_text(angle=60, hjust=1)), 
 
   # distribution by value of Financial Security 
   tidy_table %>% 
      ggplot(aes(Cat_Fin))+
      geom_bar() + 
      theme(axis.text.x = element_text(angle=60, hjust=1)),
    ncol=2
)

# Demographical Information
grid.arrange(
  # distribution by gender
  tidy_table %>% 
      ggplot(aes(gender))+
      geom_bar() + 
      theme(axis.text.x = element_text(angle=60, hjust=1)),
  # distribution by jobstatus
   tidy_table %>% 
      ggplot(aes(jobstatus)) +  
      geom_bar() + 
      theme(axis.text.x = element_text(angle=60, hjust=1)),
  # distribution by educ
   tidy_table %>% 
      ggplot(aes(educ))+
      geom_bar() + 
      theme(axis.text.x = element_text(angle=60, hjust=1)),
   # distribution by marstatus
   tidy_table %>% 
      ggplot(aes(marstatus))+
      geom_bar() + 
      theme(axis.text.x = element_text(angle=60, hjust=1)),
   # distribution by Hhincome
  tidy_table %>% 
      ggplot(aes(Cat_Hhincome))+
      geom_bar() + 
      theme(axis.text.x = element_text(angle=60, hjust=1)),
    # distribution by Age
  tidy_table %>% 
      ggplot(aes(Cat_age))+
      geom_bar() + 
      theme(axis.text.x = element_text(angle=60, hjust=1)),

  
  ncol=2
)

  • Finding of the data: ++ Most people are with middle or low house holdincome ++ more than 50% people join the retirement plan but most are witn low balance

Having completed our examination of the categorical variables, we move on to the numeric variables.

# Code histograms using grid.arrange so can see all quant variables together 
grid.arrange(
   # Value of Financial Security distribution
  tidy_table  %>% 
    ggplot(aes(FinancialSecurity)) +
      geom_histogram(),
   # Attitude of Sportfan distribution
  tidy_table  %>% 
    ggplot(aes(sportfan)) +
      geom_histogram(),
   # Attitude of Community connected distribution
  tidy_table  %>% 
    ggplot(aes(commun)) +
      geom_histogram(),
   # Attitude of Environment Friendly distribution
  tidy_table  %>% 
    ggplot(aes(envichoi)) +
       geom_histogram(),
   # Behavior of money on charities
  tidy_table  %>% 
    ggplot(aes(charit)) +
      geom_histogram(),
   # Behavior of  time on whatching TV distribution
  tidy_table  %>% 
    ggplot(aes(tv)) +
      geom_histogram(),
   # Behavior of  time on surfing internet distribution
  tidy_table  %>% 
    ggplot(aes(intrnet)) +
       geom_histogram(),
   # Behavior of  time on social media distribution
  tidy_table  %>% 
    ggplot(aes(socmed)) +
      geom_histogram(),
  # Behavior of time on exercise distribution
  tidy_table  %>% 
    ggplot(aes(excer)) +
      geom_histogram()
   

)

  • Finding of the data: ++ Many people are super sports fans and strongly connected to community, the marketing activity may focus on these targets to attract sportsfan family and get higher referrals through members in the community. ++ Many people spent a lot of time on tv and internet, it canbe the main marketing chennel.

4.2 Multivariate Data Exploration

4.2.1 Non-graphical

First, we examine overall characteristics using cross-tabs and table

# education and retirement plan balance
tidy_table %>% 
  tabyl(educ,Cat_retirement) %>% 
  adorn_totals(where = c("row", "col")) %>% 
  adorn_percentages(denominator = "all") %>%  
  adorn_pct_formatting(digits = 0)
##                  educ  NO YES Total
##      college graduate 11% 22%   33%
##       graduate degree  5% 17%   23%
##  high school graduate  8%  4%   12%
##                 other  0%  1%    1%
##          some college 18% 13%   30%
##      some high school  1%  1%    1%
##                 Total 43% 57%  100%
# gender and retirement plan balance
tidy_table %>% 
  tabyl(gender,retirement) %>% 
  adorn_totals(where = c("row", "col")) %>% 
  adorn_percentages(denominator = "all") %>%  
  adorn_pct_formatting(digits = 0)
##  gender <20k >=500k 100k-150k 200k-300k 20k-60k 300k-400k 400k-500k
##  Female  15%     3%        5%        2%      6%        3%        0%
##    Male   6%     2%        2%        2%      4%        1%        1%
##   Total  21%     6%        6%        4%     10%        4%        1%
##  60k-100k  NO Total
##        4% 30%   69%
##        1% 13%   31%
##        5% 43%  100%
# household income and retirement plan balance
tidy_table %>% 
  tabyl(Cat_Hhincome,Cat_retirement) %>% 
  adorn_totals(where = c("row", "col")) %>% 
  adorn_percentages(denominator = "all") %>%  
  adorn_pct_formatting(digits = 0)
##  Cat_Hhincome  NO YES Total
##          High  1%  3%    4%
##        Middle 14% 38%   52%
##       Pov/Low 28% 16%   44%
##         Total 43% 57%  100%
# marital status and Value of Financial Security
tidy_table %>% 
  tabyl(marstatus,Cat_Fin) %>% 
  adorn_totals(where = c("row", "col")) %>% 
  adorn_percentages(denominator = "all") %>%  
  adorn_pct_formatting(digits = 0)
##  marstatus not important slightly important moderate important
##   Divorced            1%                 0%                 4%
##    Married            1%                 2%                28%
##      Other            0%                 0%                 1%
##  Partnered            1%                 1%                 8%
##     single            1%                 2%                17%
##    Widowed            0%                 0%                 0%
##      Total            4%                 5%                57%
##  very important extrem important Total
##              3%               1%    8%
##             10%               4%   45%
##              1%               0%    1%
##              3%               1%   13%
##              6%               3%   30%
##              1%               1%    2%
##             23%              11%  100%
# marital status and retirement plan balance
tidy_table %>% 
  tabyl(marstatus,Cat_retirement) %>% 
  adorn_totals(where = c("row", "col")) %>% 
  adorn_percentages(denominator = "all") %>%  
  adorn_pct_formatting(digits = 0)
##  marstatus  NO YES Total
##   Divorced  4%  5%    8%
##    Married 13% 32%   45%
##      Other  1%  1%    1%
##  Partnered  8%  5%   13%
##     single 16% 13%   30%
##    Widowed  1%  2%    2%
##      Total 43% 57%  100%
  • Finding of the data: ++ For educational attainment, with graduate degree the percentage of joining the retirement plan is three times that of not joining the plan,with college degree the percentage of joining the retirement plan is two times that of not joining the plan.It seems that the higher the educational attainment, the people maybe more willing to join the retirement plan. ++ For household income, the Middle Income people get the highest retirement plan particaption rate with 38%, and among the middle income group, people are more willing to join the plan with the percentage of joining the plan approximately 3 times higher than not joining the plan ++ For marital status, among the group of married and widowed people, the probability for retirement plan partipation is higher, and the widowed people value most for finacial security with respondents falls on either very or exremely important.

4.2.2 graphical

Now, we move on to understanding covariance graphically and to understanding the relationships among numeric variables.

# Use tile graph to show which group value Financial Security most

ggplotly(p<- tidy_table %>%
  group_by(gender,Cat_Hhincome) %>% 
  summarise(Fina = mean(FinancialSecurity)) %>% 
  ggplot(aes(gender,Cat_Hhincome)) + 
  geom_tile(aes(fill = -Fina)))
ggplotly(p<- tidy_table %>%
  group_by(marstatus,gender) %>% 
  summarise(Fina = mean(FinancialSecurity)) %>% 
  ggplot(aes(marstatus,gender)) + 
  geom_tile(aes(fill = -Fina)))
ggplotly(p<- tidy_table %>%
  group_by(marstatus,Cat_age) %>% 
  summarise(Fina = mean(FinancialSecurity)) %>% 
  ggplot(aes(marstatus,Cat_age)) + 
  geom_tile(aes(fill = -Fina)))
ggplotly(p <- tidy_table %>%
  group_by(educ,marstatus) %>% 
  summarise(Fina = mean(FinancialSecurity)) %>% 
  ggplot(aes(educ, marstatus)) + 
  theme(axis.text.x = element_text(angle=60, hjust=1)) +
  geom_tile(aes(fill = -Fina)))

+ Finding of the data: ++ The following segments of people value Financial Security most: Middleincome Female 7.27, WidowFemale 7.37, Divorced Male 7.73, 60s widowed 9, widowed with graduate degree 8.5 with highschool degree 9

# Use cor to get the correlation between the variable with interest
pairs_data<-tidy_table%>%dplyr::select(FinancialSecurity,tv,intrnet,socmed,charit,excer,sportfan,commun,envichoi,gender,age,Cat_Hhincome,marstatus)
ggpairs(pairs_data)

  • Interesting correlations finding: ++ there’s a positive relationship between age and value of Financial security , time on TV as well as money on charity, but negative relationship with that to community connected and time on exercise and sport
  • there’s strong relationship with time spent TV and Internet ++ the attitude towards commnity, environment and sportsfans is positive correlative

From these graphs, it’s unclear which targets should be made and how the marketing should be positioned, I will research this further in the detailed EDA.

# Boxplots of Finacialsecurity by gender, marital status, householdincome and education
grid.arrange(
  # Finacialsecurity by gender
   tidy_table %>%
    ggplot(aes(x = gender, y = FinancialSecurity)) +
    geom_boxplot() +
    coord_flip(),
  # Finacialsecurity by marital status
   tidy_table %>%
    ggplot(aes(x = marstatus, y = FinancialSecurity)) +
    geom_boxplot() +
    coord_flip(),
   # Finacialsecurity by Hhincome
   tidy_table %>%
    ggplot(aes(x = Cat_Hhincome, y = FinancialSecurity)) +
    geom_boxplot() +
    coord_flip(),
    # Finacialsecurity by Education
   tidy_table %>%
    ggplot(aes(x = educ, y = FinancialSecurity)) +
    geom_boxplot() +
    coord_flip(),
  ncol = 2
)

  • finding from the graph: ++ widowed value FinancialSecuriy most

5 Detailed and Statistical EDA

For this section of the exploration, we examine the statistical validity of the most interesting findings discussed in the basic EDA.

Question1: Is the participation of the retirement plan related to the higher value of Financial Security?

g1 <- tidy_table %>% 
  group_by(Cat_Fin,Cat_retirement) %>%
  summarise(count= n()) %>%
  ggplot(aes(x = Cat_Fin, y = count, fill = Cat_retirement)) +
    geom_bar(stat = "identity", position = "fill") + 
    labs(fill = "retirement plan", y = "Percentage") +
    coord_flip() +
    scale_y_continuous(labels = percent_format(accuracy = 1, suffix = "%")) +
    theme_classic() +
    guides(fill = guide_legend(reverse = TRUE)) +
    theme(legend.position = "top", 
          legend.justification = "center",
          legend.title = element_text(face = "bold"),
          axis.title.y = element_blank(),
          axis.ticks.y = element_blank(),
          axis.line.y = element_blank()) +
    scale_fill_brewer(palette = "Blues") +
     ggtitle("Retirement Plan Distribution By value of Financial Security\nTarget at the people with high value of Financial Security")

g1

  • the people with higher value in Financial Security would be more likely to join the plan

Question2: Is the participation of the retirement plan related to the marital status?

# remove  "other" information in marital status

g2 <- tidy_table %>%filter(marstatus!="Other"&gender=="Female")%>%
  group_by(marstatus,Cat_retirement) %>%
  summarise(count= n()) %>%
  ggplot(aes(x = marstatus, y = count, fill = Cat_retirement)) +
    geom_bar(stat = "identity", position = "fill") + 
    labs(fill = "retirement plan",y = "Percentage") +
    coord_flip() +
    scale_y_continuous(labels = percent_format(accuracy = 1, suffix = "%")) +
    theme_classic() +
    guides(fill = guide_legend(reverse = TRUE)) +
    theme(legend.position = "top", 
          legend.justification = "center",
          legend.title = element_text(face = "bold"),
          axis.title.y = element_blank(),
          axis.ticks.y = element_blank(),
          axis.line.y = element_blank()) +
    scale_fill_brewer(palette = "Blues") +
  ggtitle("Retirement Plan Distribution By Marital Status",
          subtitle = "Target at the people for widowed or married people")

g2

  • the married, Divoced and widowed people share high likelihood to join the plan

Question3: Is the participation of the retirement plan related to age?

g3 <-
  tidy_table %>% 
  group_by(Cat_age, Cat_retirement) %>%
  summarise(count= n()) %>%
  ggplot(aes(x = Cat_age, y = count, fill = Cat_retirement)) +
    geom_bar(stat = "identity", position = "fill") + 
    labs(fill = "retirement plan", y = "Percentage") +
    coord_flip() +
    scale_y_continuous(labels = percent_format(accuracy = 1, suffix = "%")) +
    theme_classic() +
    guides(fill = guide_legend(reverse = TRUE)) +
    theme(legend.position = "top", 
          legend.justification = "center",
          legend.title = element_text(face = "bold"),
          axis.title.y = element_blank(),
          axis.ticks.y = element_blank(),
          axis.line.y = element_blank()) +
    scale_x_discrete(limits=c("<20","20s",">=70","40s","30s","60s","50s")) +
    scale_fill_brewer(palette = "Blues") +
    ggtitle("Retirement Plan Distribution By Age\nTarget at people in 50s and 60s")

g3

  • it’s not surprising that 50s to 60s people are more likely to join/remain in the retirement plan, since the 66 is the full retirement age, if people withdraw before that age may face high reductions of tax, and 71 is the impative age to withdraw the retirment plan.

Question4: Is the participation of the retirement plan related to age?

g4 <-
  tidy_table %>% 
  group_by(educ, Cat_retirement,gender) %>%
  summarise(count= n()) %>%
  ggplot(aes(x = educ, y = count, fill = Cat_retirement)) +
    geom_bar(stat = "identity", position = "fill") + 
    labs(fill = "retirement plan", y = "Percentage") +
    coord_flip() +
    scale_y_continuous(labels = percent_format(accuracy = 1, suffix = "%")) +
    theme_classic() +
    guides(fill = guide_legend(reverse = TRUE)) +
    theme(legend.position = "top", 
          legend.justification = "center",
          legend.title = element_text(face = "bold"),
          axis.title.y = element_blank(),
          axis.ticks.y = element_blank(),
          axis.line.y = element_blank()) +
    scale_fill_brewer(palette = "Blues") +
    ggtitle("Retirement Plan Distribution By Education",
          subtitle = "higher edcuation maybe more likely to join the plan")

g4

  • even it seems more likely to join the plan with higher education, but there are missing information for others

Question5: How Value of Financial Security differentiated by age?

g5 <-tidy_table %>% 
  group_by(Cat_age, Cat_Fin) %>%
  summarise(count= n()) %>%
  ggplot(aes(x = Cat_age, y = count, fill =Cat_Fin)) +
    geom_bar(stat = "identity", position = "fill") + 
    labs(fill = "Value", y = "Percentage") +
    coord_flip() +
    scale_y_continuous(labels = percent_format(accuracy = 1, suffix = "%")) +
    theme_classic() +
    guides(fill = guide_legend(reverse = TRUE)) +
    theme(legend.position = "top", 
          legend.justification = "center",
          legend.title = element_text(face = "bold"),
          axis.title.y = element_blank(),
          axis.ticks.y = element_blank(),
          axis.line.y = element_blank()) +
    scale_fill_brewer(palette = "Blues") +
    scale_x_discrete(limits=c("<20",">=70","20s","30s","40s","60s","50s")) +
    ggtitle("Value of Financial Security By Age\n 50s is a good target - more than 98% think it important")
g5

  • people at 50s value Financial Security Most

Question6: How Value of Financial Security differentiated by marital status?

g6 <-tidy_table %>% 
  group_by(marstatus, Cat_Fin) %>%filter(marstatus!="Other")%>%
  summarise(count= n()) %>%
  ggplot(aes(x = marstatus, y = count, fill =Cat_Fin)) +
    geom_bar(stat = "identity", position = "fill") + 
    labs(fill = "Value", y = "Percentage") +
    
    scale_y_continuous(labels = percent_format(accuracy = 1, suffix = "%")) +
    theme_classic() +
    guides(fill = guide_legend(reverse = TRUE)) +
    theme(legend.position = "right", 
          legend.justification = "center",
          legend.title = element_text(face = "bold"),
          axis.title.y = element_blank(),
          axis.ticks.y = element_blank(),
          axis.line.y = element_blank()) +
    scale_fill_brewer(palette = "Blues") +
    ggtitle("Value of Financial Security By Marital Status\nwidowed peopleis a good target- most of them think it extrem or very important")+
    scale_x_discrete(limits=c("Widowed","Divorced","Married","single","Partnered")) 

g6

  • Most widowed people value Financial Security extrem or very imporance

Question7: How Value of Financial Security differentiated by household income?

g7 <-tidy_table %>% 
  group_by(Cat_Hhincome, Cat_Fin) %>%
  summarise(count= n()) %>%
  ggplot(aes(x = Cat_Hhincome, y = count, fill =Cat_Fin)) +
    geom_bar(stat = "identity", position = "fill") + 
    labs(fill = "Value", y = "Percentage") +
    coord_flip() +
    scale_y_continuous(labels = percent_format(accuracy = 1, suffix = "%")) +
    theme_classic() +
    guides(fill = guide_legend(reverse = TRUE)) +
    theme(legend.position = "right", 
          legend.justification = "center",
          legend.title = element_text(face = "bold"),
          axis.title.y = element_blank(),
          axis.ticks.y = element_blank(),
          axis.line.y = element_blank()) +
    scale_fill_brewer(palette = "Blues") +
    ggtitle("Value of Financial Security By Household Income",
          subtitle = "Most widowed people value Financial Security extrem or very imporance")

g7

  • people at from Middle class value Financial Security Most

Question8: how participation of the retirement plan related to both the Householdincome and marital status?

g8 <- tidy_table %>%  filter(marstatus!="Other") %>% 
group_by(Cat_Hhincome,marstatus,Cat_retirement) %>%
   summarise(count = n()) %>% 
  ggplot(aes(x = marstatus, y = count, fill=Cat_retirement)) +
  geom_bar(stat = "identity", position = "stack") +
  theme_classic() +
   coord_flip() +
  labs(fill = "Retirement Plan",y="count") +
  theme(axis.text = element_text(face = "bold", size = 9),
        axis.title = element_blank(),
        axis.ticks.y = element_blank(),
        axis.line.y = element_blank(),
        axis.line.x = element_blank(),
        legend.position = "right",
        legend.title = element_text(face = "bold")) +
  ggtitle("participation of the retirement plan by household income and marital status\n married people with Middlehousehold income is a large potential ") +
    scale_fill_brewer() +
    facet_grid(Cat_Hhincome ~ .)
  
g8

  • so the married people with middle householdincome is our target, this segment have most populations and and large retirement participation rate

Question9: Is the participation of the retirement plan related to both the age and marital status?

g9 <- tidy_table %>%  filter(Cat_age!="<20",marstatus!="Other") %>% 
group_by(Cat_age,marstatus,Cat_retirement) %>%
   summarise(count = n()) %>% 
    mutate(total = sum(count)) %>% 
    mutate(percent = count/total) %>% 
  ggplot(aes(x = marstatus, y = percent, fill=Cat_retirement)) +
  geom_bar(stat = "identity", position = "fill") +
  theme_classic() +
   coord_flip() +
  labs(fill = "Retirement Plan") +
      scale_y_continuous(labels = percent_format(accuracy = 1, suffix = "%")) +
  theme(axis.text = element_text(face = "bold", size = 9),
        axis.title = element_blank(),
        axis.ticks.y = element_blank(),
        axis.line.y = element_blank(),
        axis.line.x = element_blank(),
        legend.position = "top",
        legend.title = element_text(face = "bold")) +
  ggtitle("participation of the retirement plan by age and marital status",
          subtitle = "100% participation for 50s,60s widowed & 30s,70s divorced people") +
    scale_fill_brewer() +
   facet_wrap(.~ Cat_age) 
  
g9

  • people at 50s,60s widowed & 40s,70s divorced people participate the retirement plan with 100%
  • so the divorced people in 30s,70s and widowed people in 50s,60s our target customer

6 Statistic analysis

# remove age under 20, because 21 is the leagal age to join retirement plan
z <- qnorm(.95)
q_g1 <- tidy_table %>% 
group_by(marstatus, Cat_retirement) %>%
  summarise(Fina = mean(FinancialSecurity), sd = sd(FinancialSecurity), 
            n = n(), ci = z * sd/sqrt(n)) %>%
  ggplot(aes(x = marstatus, y = Fina, fill = Cat_retirement)) +
  geom_bar(stat = "identity", position = "dodge") +
  coord_flip() +
  geom_errorbar(aes(ymin = Fina - ci, ymax = Fina + ci), 
                width = 0.5, position = position_dodge(0.9)) +
  theme_classic() +
  labs(fill = "Retirement Plan") +
  theme(axis.text = element_text(face = "bold", size = 11),
        axis.title = element_blank(),
        axis.ticks.y = element_blank(),
        axis.line.y = element_blank(),
        axis.line.x = element_line(colour = "grey"),
        legend.position = "top",
        legend.title = element_text(face = "bold")) +
  scale_fill_brewer() +
  ggtitle("Average Financial Security by Marital Status and Retirement Plan Participation",
          subtitle = "95% Confidence Interval ")

q_g1

  • 95% Confidence Interval of Expcted Financial Security Conditional on Marital Status and Retirement Plan Participation
# remove age under 20, because 21 is the leagal age to join retirement plan
z <- qnorm(.95)
q_g2 <- tidy_table %>% filter(Cat_age!="<20") %>% 
group_by(Cat_age, Cat_retirement) %>%
  summarise(Fina = mean(FinancialSecurity), sd = sd(FinancialSecurity), 
            n = n(), ci = z * sd/sqrt(n)) %>%
  ggplot(aes(x = Cat_age, y = Fina, fill = Cat_retirement)) +
  geom_bar(stat = "identity", position = "dodge") +
  coord_flip() +
  geom_errorbar(aes(ymin = Fina - ci, ymax = Fina + ci), 
                width = 0.5, position = position_dodge(0.9)) +
  theme_classic() +
  labs(fill = "Retirement Plan") +
  theme(axis.text = element_text(face = "bold", size = 11),
        axis.title = element_blank(),
        axis.ticks.y = element_blank(),
        axis.line.y = element_blank(),
        axis.line.x = element_line(colour = "grey"),
        legend.position = "top",
        legend.title = element_text(face = "bold")) +
  scale_fill_brewer() +
  ggtitle("Average Financial Security by Age and Retirement Plan Participation",
          subtitle = "95% Confidence Interval ")

q_g2

  • 95% Confidence Interval of Expcted Financial Security Conditional on Age and Retirement Plan Participation

Logit model and calculate odd ratios

tidy_table <- tidy_table %>% mutate(Part_Retirement = case_when(Cat_retirement == "NO" ~ "0" , TRUE~ "1"), Part_Retirement =as.numeric(Part_Retirement ))

# Logtit model
 Logit <- glm( Part_Retirement ~ Cat_Fin+Cat_age+jobstatus+Hhincome+marstatus, 
               family = binomial(link = "logit"), 
                data = tidy_table)
summary(Logit)
## 
## Call:
## glm(formula = Part_Retirement ~ Cat_Fin + Cat_age + jobstatus + 
##     Hhincome + marstatus, family = binomial(link = "logit"), 
##     data = tidy_table)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4736  -0.8970   0.4444   0.8361   2.2063  
## 
## Coefficients:
##                           Estimate Std. Error z value Pr(>|z|)    
## (Intercept)               -0.88169    1.95784  -0.450  0.65247    
## Cat_Finslightly important -0.44567    0.91137  -0.489  0.62483    
## Cat_Finmoderate important -0.53801    0.73774  -0.729  0.46584    
## Cat_Finvery important     -0.33175    0.76293  -0.435  0.66368    
## Cat_Finextrem important    0.38989    0.82443   0.473  0.63627    
## Cat_age20s                 2.59199    1.30874   1.981  0.04765 *  
## Cat_age30s                 2.77253    1.38421   2.003  0.04518 *  
## Cat_age40s                 2.77059    1.38458   2.001  0.04539 *  
## Cat_age50s                 3.64575    1.40883   2.588  0.00966 ** 
## Cat_age60s                 3.66620    1.42334   2.576  0.01000 *  
## Cat_age>=70                2.71593    1.50274   1.807  0.07071 .  
## jobstatusFaculty           0.33505    1.25759   0.266  0.78992    
## jobstatusNon-university   -0.79140    1.16236  -0.681  0.49596    
## jobstatusStaff            -0.67297    1.24412  -0.541  0.58856    
## jobstatusStudent           0.77911    1.27867   0.609  0.54232    
## Hhincome$150,000-$199,999  0.21085    1.20148   0.175  0.86069    
## Hhincome$20,000-$39,999   -0.95116    0.50074  -1.900  0.05750 .  
## Hhincome$200,000 & above   0.64665    1.37819   0.469  0.63893    
## Hhincome$40,000-$59,999   -0.17911    0.50263  -0.356  0.72159    
## Hhincome$60,000-$79,999   -0.48364    0.53942  -0.897  0.36994    
## Hhincome$80,000-$99,999    0.22075    0.57234   0.386  0.69972    
## HhincomeBelow $20,000     -2.17140    0.53078  -4.091  4.3e-05 ***
## marstatusMarried          -0.04654    0.50715  -0.092  0.92689    
## marstatusOther            -0.10663    1.26817  -0.084  0.93299    
## marstatusPartnered        -0.67002    0.58297  -1.149  0.25043    
## marstatussingle           -0.07409    0.53681  -0.138  0.89022    
## marstatusWidowed           0.54053    0.97805   0.553  0.58050    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 500.53  on 365  degrees of freedom
## Residual deviance: 388.38  on 339  degrees of freedom
## AIC: 442.38
## 
## Number of Fisher Scoring iterations: 5
# calculate odd ratios
logitor(Part_Retirement ~ Cat_Fin+Cat_age+jobstatus+Hhincome+marstatus,data = tidy_table)
## Call:
## logitor(formula = Part_Retirement ~ Cat_Fin + Cat_age + jobstatus + 
##     Hhincome + marstatus, data = tidy_table)
## 
## Odds Ratio:
##                           OddsRatio Std. Err.       z     P>|z|    
## Cat_Finslightly important  0.640395  0.583638 -0.4890   0.62483    
## Cat_Finmoderate important  0.583909  0.430773 -0.7293   0.46584    
## Cat_Finvery important      0.717665  0.547528 -0.4348   0.66368    
## Cat_Finextrem important    1.476824  1.217538  0.4729   0.63627    
## Cat_age20s                13.356301 17.479952  1.9805   0.04765 *  
## Cat_age30s                15.998998 22.146043  2.0030   0.04518 *  
## Cat_age40s                15.967987 22.109019  2.0010   0.04539 *  
## Cat_age50s                38.311385 53.974378  2.5878   0.00966 ** 
## Cat_age60s                39.103065 55.656855  2.5758   0.01000 *  
## Cat_age>=70               15.118595 22.719314  1.8073   0.07071 .  
## jobstatusFaculty           1.398005  1.758115  0.2664   0.78992    
## jobstatusNon-university    0.453209  0.526791 -0.6809   0.49596    
## jobstatusStaff             0.510192  0.634740 -0.5409   0.58856    
## jobstatusStudent           2.179534  2.786902  0.6093   0.54232    
## Hhincome$150,000-$199,999  1.234729  1.483507  0.1755   0.86069    
## Hhincome$20,000-$39,999    0.386293  0.193431 -1.8995   0.05750 .  
## Hhincome$200,000 & above   1.909129  2.631151  0.4692   0.63893    
## Hhincome$40,000-$59,999    0.836016  0.420208 -0.3563   0.72159    
## Hhincome$60,000-$79,999    0.616538  0.332574 -0.8966   0.36994    
## Hhincome$80,000-$99,999    1.247009  0.713711  0.3857   0.69972    
## HhincomeBelow $20,000      0.114018  0.060519 -4.0909 4.296e-05 ***
## marstatusMarried           0.954531  0.484093 -0.0918   0.92689    
## marstatusOther             0.898862  1.139912 -0.0841   0.93299    
## marstatusPartnered         0.511698  0.298306 -1.1493   0.25043    
## marstatussingle            0.928587  0.498475 -0.1380   0.89022    
## marstatusWidowed           1.716909  1.679229  0.5527   0.58050    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Indeed, here I find statistical confirmation of some of the visual exploration.

For the age category, at the 1% significance level, the estimated odds of participating the financial retirement plan for people in 50s and 60s is 38.31 and 39.1 times higher compared to the group under 20s. For the Household income category, it indicate that the lower the income, the less likely a person would join the retirement plan, the estimated odds of participating the plan for people with HhincomeBelow 20,000 is 88.6% lower than that of the people with HhincomeBelow over 200,000. For the Marital Status, it shows that the estimated odds for Widowed person to partipate the plan is 1.7 times higher than the divorced one, however, this result is not statistically significant at 10% level.

7 Findings and Recommendations

  • The binary response model shows that holding other variables constont, people with age of 50s and 60s have higher odds of participating in retirement plan. The regression results also indicate that family with low household income have lower odds participation in retirement plan.
  • I identified the impacts of demographics and socioeconomic variables on demand for retirement plan and suggested that marketing campagin should target people with the following characteristics: high value of financial security, middle income, and those who are widowed or divorced.

8 Save File

# Save the rds file so I can reuse anything from this file in another file
save.image("Final_TA.RData")